1 Introduction

Multiples factors play an important function during the naive CD4-T cell differentiation where Blimp1 (encoded by Prdm1) and Bcl6 are the masters. In this notebook, we are going to characterize the accessibility pattern of these regulators.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(GenomicRanges)
library(pheatmap)
library(JASPAR2020)
library(TFBSTools)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(data.table)
library(ggpubr)
library(writexl)
library(plyr)
library(stringr)
library(rio)
library(biomaRt)

2.2 Parameters

cell_type <- "CD4_T"

path_to_obj <- paste0(
  here::here("scATAC-seq/results/R_objects/level_5/"),
  cell_type,
  "/04.",
  cell_type,
  "_integration_peak_calling_level_5.rds",
  sep = ""
)

path_to_RNA <- paste0(
  here::here("scRNA-seq/3-clustering/5-level_5/"),
  cell_type,
  "/",
  cell_type,
  "_subseted_annotated_level_5.rds",
  sep = ""
)

upstream <- 2000

color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", 
                    "#5A5156", "#AA0DFE", "#F8A19F", "#F7E1A0",
                    "#1C8356", "#FEAF16", "#822E1C", "#C4451C", 
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA",
                    "#FBE426", "#16FF32",  "black",  "#3283FE",
                    "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3", 
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", 
                    "#822E1C", "coral2",  "#1CFFCE", "#1CBE4F", 
                    "#3283FE", "#FBE426", "#F7E1A0", "#325A9B", 
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD",
                    "#FEAF16", "#683B79", "#B10DA1", "#1C7F93", 
                    "#F8A19F", "dark orange", "#FEAF16", 
                    "#FBE426", "Brown")

2.3 Functions

plot_dim <- function(seurat, group){
  DimPlot(seurat, 
  group.by = group,
  cols = color_palette,
  pt.size = 0.1,raster=FALSE)
}

mat_heatmap <- function(seurat, features, group,
                        cutree_ncols,cutree_nrows){
  
my_sample_col <- data.frame(Group = rep(c("Naive", "Central Memory",
                                          "Central Memory","Non-Tfh",
                                          "Non-Tfh","Non-Tfh",
                                          "Non-Tfh","Non-Tfh",
                                          "Non-Tfh","Tfh",
                                          "Tfh","Tfh",
                                          "Tfh","Tfh")))

rownames(my_sample_col) = c("Naive", "CM Pre-non-Tfh","CM PreTfh",
                            "T-Trans-Mem","T-Eff-Mem","T-helper",
                            "Eff-Tregs","non-GC-Tf-regs","GC-Tf-regs" ,
                            "B border_Tfh T", "Tfh-LZ-GC",
                            "GC-Tfh-SAP","GC-Tfh-0X40","Tfh-Mem")

annoCol<-list(Group=c("Naive"="grey", "Central Memory"="black", 
                      "Non-Tfh"="red", "Tfh"="yellow"))

avgexpr_mat <- AverageExpression(
features = features,
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = group,
slot = "data")

p1 <- pheatmap(avgexpr_mat$peaks_level_5[,c(rownames(my_sample_col))], 
        scale = "row",
        angle_col = 45,
        show_rownames=T,
        annotation_col = my_sample_col,
        annotation_colors = annoCol,
        border_color = "white",
        cluster_rows = T,
        cluster_cols = F,
        fontsize_col = 10,
        clustering_distance_rows = "euclidean",
        clustering_method = "ward.D2",
        cutree_rows = cutree_nrows) 

return(p1)}

3 Load Gene reference data

ensembl <- useMart(biomart = "ensembl",dataset="hsapiens_gene_ensembl")
Datasets <- listDatasets(ensembl)
Datasets[grep("hsapiens_gene_ensembl",Datasets$dataset),]
##                  dataset              description    version
## 80 hsapiens_gene_ensembl Human genes (GRCh38.p13) GRCh38.p13
hg38.gene.annot <- getBM(attributes = c("ensembl_gene_id",
                                        "entrezgene_id",
                                        "hgnc_symbol",
                                        "chromosome_name",
                                        "start_position",
                                        "end_position",
                                        "strand","band",
                                        "gene_biotype"),
                         mart = ensembl)

## Arrange also strand info.
hg38.gene.annot$chromosome_name <- paste0("chr",hg38.gene.annot$chromosome_name)
hg38.gene.annot$strand[which(hg38.gene.annot$strand==1)] <- "+"
hg38.gene.annot$strand[which(hg38.gene.annot$strand=="-1")] <- "-"

##polish gene annotation
hg38.gene.annot$hgnc_symbol[which(hg38.gene.annot$hgnc_symbol=="")] <- NA

##make GRange object
hg38.gene.annot.GR <- GRanges(seqnames = hg38.gene.annot$chromosome_name,
                              ranges = IRanges(hg38.gene.annot$start_position,
                                               end = hg38.gene.annot$end_position),
                              strand = hg38.gene.annot$strand)

mcols(hg38.gene.annot.GR) <- hg38.gene.annot[,grep("^chromosome_name$|^start_position$|^end_position|^strand$$",
                                                   colnames(hg38.gene.annot),value = T,invert = T)]
hg38.gene.annot.GR <- sort(sortSeqlevels(hg38.gene.annot.GR))

## Extend 2,000bps upstream of promoters
hg38.gene.annot.2000.GR <- punion(promoters(x = hg38.gene.annot.GR,
                                            upstream = 2000,
                                            downstream = 0),
                                  hg38.gene.annot.GR)

hg38.gene.annot.2000.GR$gene_name <- hg38.gene.annot.GR$hgnc_symbol

3.1 CD4-T cells data

seurat <- readRDS(path_to_obj)
seurat_peaks <- seurat@assays$peaks_level_5@ranges
seurat
## An object of class Seurat 
## 93116 features across 16383 samples within 1 assay 
## Active assay: peaks_level_5 (93116 features, 92407 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
plot_dim(seurat, group = "annotation_paper") 

seurat_RNA <- readRDS(path_to_RNA)
seurat_RNA
## An object of class Seurat 
## 37378 features across 52307 samples within 1 assay 
## Active assay: RNA (37378 features, 0 variable features)
##  3 dimensional reductions calculated: pca, umap, harmony
plot_dim(seurat_RNA, group = "annotation_paper") 

3.1.1 Grouping the cells in Non-Tfh & Tfh groups

At low level of resolution, we want to detect the main epigenomic changes between Non-Tfh vs Tfh. For this reason, we decide to group the cells in 4 clusters: Non-Tfh, Tfh, Central Memory and Naive.

seurat@meta.data <- seurat@meta.data %>% mutate(Group =
  case_when(annotation_paper == "Naive" ~ "Naive",
    annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
    annotation_paper == "CM PreTfh" ~ "Central Memory",
    annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
    annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
    annotation_paper == "T-helper" ~ "Non-Tfh",
    annotation_paper == "Tfh T:B border" ~ "Tfh",
    annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
    annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
    annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
    annotation_paper == "Tfh-Mem" ~ "Tfh",
    annotation_paper == "Memory T cells" ~ "Non-Tfh",
    annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
    annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
    annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))

plot_dim(seurat, group = "Group") 

seurat_RNA@meta.data <- seurat_RNA@meta.data %>% mutate(Group =
  case_when(annotation_paper == "Naive" ~ "Naive",
    annotation_paper == "CM Pre-non-Tfh" ~ "Central Memory",
    annotation_paper == "CM PreTfh" ~ "Central Memory",
    annotation_paper == "T-Trans-Mem" ~ "Non-Tfh",
    annotation_paper == "T-Eff-Mem" ~ "Non-Tfh",
    annotation_paper == "T-helper" ~ "Non-Tfh",
    annotation_paper == "Tfh T:B border" ~ "Tfh",
    annotation_paper == "Tfh-LZ-GC" ~ "Tfh",
    annotation_paper == "GC-Tfh-SAP" ~ "Tfh",
    annotation_paper == "GC-Tfh-0X40" ~ "Tfh",
    annotation_paper == "Tfh-Mem" ~ "Tfh",
    annotation_paper == "Memory T cells" ~ "Non-Tfh",
    annotation_paper == "Eff-Tregs" ~ "Non-Tfh",
    annotation_paper == "non-GC-Tf-regs" ~ "Non-Tfh",
    annotation_paper == "GC-Tf-regs" ~ "Non-Tfh"))

plot_dim(seurat_RNA, group = "Group") 

4 Finding differentially expressed genes (DE)

The main goal of this analisys is to extract the differential expressed genes between the groups previously defined.

Idents(seurat_RNA) <- "Group"

#DE <- FindAllMarkers(
 # object = seurat_RNA,
  #logfc.threshold = 0.25,
  #test.use = "wilcox")

#output <- split(DE, DE$cluster)
path_to_save_DE <- here::here("scATAC-seq/results/files/CD4_T/DE_4_groups.xlsx")
#write_xlsx(output, path_to_save_DE)

#DT::datatable(DE)

4.1 Manual selection of genes candidates.

DE <- import_list(path_to_save_DE, setclass = "tbl", rbind = TRUE)

colnames(DE) <- c("p_val", "avg_log2FC", 
                  "pct.1" , "pct.2",  "p_val_adj",
                  "cluster", "gene_name", "_file")
Tfh_genes <- c("TOX2", "PDCD1","CXCL13","TOX","BCL6",
              "GNG4","IL21","SH2D1A","CD200","CXCR5","POU2AF1")

Naive_genes <- c("BACH2","LEF1","CCR7","NOSIP","KLF2","SELL")

Central_memory_genes <- c("ANK3","IL7R","TXNIP","ANXA1",
                          "ZBTB16","GPR183","TIGIT","IL21")

Non_Tfh_genes <- c("LAG3","RORA","IKZF2","KLRB1",
                   "IL2RA","PRDM1","IL1R1","CTLA4",
                   "FOXP3","CCR6","MAF","CCL20","IL1R2") 

target_genes <- c(Naive_genes,Central_memory_genes,
                  Tfh_genes,Non_Tfh_genes)

5 Study the chromatin dynamics in the DE genes

5.1 Extraction of the DE genes coordinates

expr_mat <- AverageExpression(
features = target_genes,
seurat_RNA,
return.seurat = F,
group.by = "Group",
slot = "data")

pheatmap(expr_mat$RNA[target_genes,],
   scale = "row",
   annotation_names_row = F,
   border_color = "white",
   cluster_rows = F,
   cluster_cols = T,
   fontsize_row = 10,
   clustering_distance_rows = "euclidean",
   clustering_distance_cols = "euclidean", 
   clustering_method = "ward.D2") 

#write.table(unique(expr_mat$RNA[target_genes,]), 
 #           quote = F, 
  #          here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_RNA_genes.tsv"))

6 Study the chromatin dynamics in the DE genes

6.1 Extraction of the DE genes coordinates

hg38.gene.annot_targetted <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% target_genes),]

## Overlapping of the DE genes coordinates with the total number of peaks detected.
gr1 <- seurat_peaks
gr2 <- hg38.gene.annot_targetted
m <- findOverlaps(gr1, gr2)
gr1.matched <- gr1[queryHits(m)]
# Add the metadata from gr2
mcols(gr1.matched) <- cbind.data.frame(
    mcols(gr1.matched),
    mcols(gr2[subjectHits(m)]));

gr1.matched
## GRanges object with 515 ranges and 1 metadata column:
##         seqnames              ranges strand |   gene_name
##            <Rle>           <IRanges>  <Rle> | <character>
##     [1]     chr1 145996409-145997493      * |       TXNIP
##     [2]     chr1 145998144-145998527      * |       TXNIP
##     [3]     chr1 169692335-169693366      * |        SELL
##     [4]     chr1 169694288-169694677      * |        SELL
##     [5]     chr1 169699061-169699292      * |        SELL
##     ...      ...                 ...    ... .         ...
##   [511]     chrX 124328812-124329858      * |      SH2D1A
##   [512]     chrX 124339089-124339448      * |      SH2D1A
##   [513]     chrX 124342323-124342811      * |      SH2D1A
##   [514]     chrX 124346063-124346707      * |      SH2D1A
##   [515]     chrX 124358836-124359410      * |      SH2D1A
##   -------
##   seqinfo: 24 sequences from an unspecified genome; no seqlengths
gr1.matched$peaks <- paste0(seqnames(gr1.matched),"-",
                             start(gr1.matched),"-",
                             end(gr1.matched))

gr1.matched_df <- as.data.frame(gr1.matched)

my_sample_col <- data.frame(Gene = c(gr1.matched$gene_name))
rownames(my_sample_col) = unique(gr1.matched$peaks)

avgexpr_mat <- AverageExpression(
features = unique(gr1.matched$peaks),
seurat,
assays = "peaks_level_5",
return.seurat = F,
group.by = "Group",
slot = "data")

avgexpr_df <- as.data.frame(avgexpr_mat$peaks_level_5)
avgexpr_df$peaks <- row.names(avgexpr_df)

DA_DE_merge <- merge(avgexpr_df,
                       gr1.matched_df[c("peaks","gene_name")],
                       by=c("peaks"))  

DA_DE_merge_melt <- melt(DA_DE_merge)

# Computing the mean accessibility/expression per gene 
mean_accessibility <- tapply(DA_DE_merge_melt$value,
                               list(DA_DE_merge_melt$gene_name, DA_DE_merge_melt$variable),
                                    mean)


out <- pheatmap(mean_accessibility[target_genes,],
         scale = "row",
         border_color = "white",
         cluster_rows = F,
         cluster_cols = T,
         fontsize_row = 10,
         clustering_distance_cols = "euclidean", 
         clustering_method = "ward.D2")

#write.table(unique(mean_accessibility[target_genes,]), 
 #           quote = F, 
  #          here::here("scATAC-seq/results/plots/CD4-T/files_plots/matrix_ATAC_genes.tsv"))
    
DA_DE_merge <- merge(melt(mean_accessibility),
                     melt(expr_mat$RNA),
                           by=c("Var1","Var2"))   

colnames(DA_DE_merge) <- c("Gene" ,"Clusters","Accesibility","Expresion")
DA_DE_merge.melt <- melt(DA_DE_merge)

# Filtering conditions
df_Naive  <- filter(
  DA_DE_merge.melt,Clusters == "Naive" & Gene %in% Naive_genes)
df_CM  <- filter(
  DA_DE_merge.melt,Clusters == "Central Memory" & Gene %in% Central_memory_genes)
df_Tfh  <- filter(
  DA_DE_merge.melt,Clusters == "Tfh" & Gene %in% Tfh_genes)
df_Non_Tfn  <- filter(
  DA_DE_merge.melt,Clusters == "Non-Tfh" & Gene %in% Non_Tfh_genes)

selection_df <- rbind(df_Naive,df_CM,df_Tfh,df_Non_Tfn)
selection_df$value <- scale(selection_df$value)
  
 ggdotchart(selection_df, 
            x="Gene", 
            y="value", 
            add = "segments") +
        coord_flip() + facet_grid(vars(Clusters), vars(variable), scales = "free_y")

7 Master regulators: BCL6 and PRDM1

7.1 BCL6

bcl6 <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% "BCL6"),]
bcl6_gr <- makeGRangesFromDataFrame(bcl6)

bcl6_plot <- CoveragePlot(
  object = seurat,
  region = bcl6_gr)

bcl6_plot

overlapping_bcl6 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, 
                                                        bcl6_gr)),]

features <- paste0(seqnames(overlapping_bcl6),"-",
                   start(overlapping_bcl6),"-",
                   end(overlapping_bcl6))

#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/bcl6_heatmaps.pdf"), 
 #   width = 10, 
  #  height = 4)

print(mat_heatmap(seurat = seurat, 
            features = features, 
            group = "annotation_paper",
            cutree_ncols = 3,cutree_nrows = 1))

#dev.off()

7.2 PRDM1

prdm1 <- hg38.gene.annot.2000.GR[which(hg38.gene.annot.2000.GR$gene_name %in% "PRDM1"),]
prdm1_gr <- makeGRangesFromDataFrame(prdm1)

prdm1_plot <-CoveragePlot(
  object = seurat,
  region = prdm1_gr)

prdm1_plot

overlapping_prdm1 <- seurat_peaks[queryHits(findOverlaps(seurat_peaks, prdm1_gr)),]

features <- paste0(seqnames(overlapping_prdm1),"-",
                   start(overlapping_prdm1),"-",
                   end(overlapping_prdm1))

#pdf(file = here::here("scATAC-seq/results/plots/CD4-T/prdm1_heatmaps.pdf"), 
 #   width = 10, 
  #  height = 4)

print(mat_heatmap(seurat = seurat, 
            features = features, 
            group = "annotation_paper",
            cutree_ncols = 3,cutree_nrows = 1))

#dev.off()
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] biomaRt_2.44.4       rio_0.5.16           plyr_1.8.6           writexl_1.4.0        ggpubr_0.4.0         data.table_1.14.0    forcats_0.5.0        stringr_1.4.0        purrr_0.3.4          readr_1.4.0          tidyr_1.1.3          tibble_3.1.2         tidyverse_1.3.0      ggplot2_3.3.5        dplyr_1.0.7          TFBSTools_1.26.0     JASPAR2020_0.99.10   pheatmap_1.0.12      GenomicRanges_1.40.0 GenomeInfoDb_1.24.2  IRanges_2.22.1       S4Vectors_0.26.0     BiocGenerics_0.34.0  Signac_1.2.1         SeuratObject_4.0.2   Seurat_4.0.3         BiocStyle_2.16.1    
## 
## loaded via a namespace (and not attached):
##   [1] rappdirs_0.3.3              SnowballC_0.7.0             rtracklayer_1.48.0          scattermore_0.7             R.methodsS3_1.8.1           bit64_4.0.5                 knitr_1.30                  irlba_2.3.3                 DelayedArray_0.14.0         R.utils_2.10.1              rpart_4.1-15                KEGGREST_1.28.0             RCurl_1.98-1.2              generics_0.1.0              cowplot_1.1.1               RSQLite_2.2.1               RANN_2.6.1                  future_1.21.0               bit_4.0.4                   spatstat.data_2.1-0         xml2_1.3.2                  lubridate_1.7.9             httpuv_1.6.1                SummarizedExperiment_1.18.1 assertthat_0.2.1            DirichletMultinomial_1.30.0 xfun_0.18                   hms_0.5.3                   evaluate_0.14               promises_1.2.0.1            fansi_0.5.0                 progress_1.2.2              caTools_1.18.2              dbplyr_1.4.4                readxl_1.3.1                igraph_1.2.6                DBI_1.1.0                   htmlwidgets_1.5.3           sparsesvd_0.2               spatstat.geom_2.2-0         ellipsis_0.3.2              backports_1.1.10           
##  [43] bookdown_0.21               annotate_1.66.0             deldir_0.2-10               vctrs_0.3.8                 Biobase_2.48.0              here_1.0.1                  ROCR_1.0-11                 abind_1.4-5                 withr_2.4.2                 ggforce_0.3.2               BSgenome_1.56.0             sctransform_0.3.2           GenomicAlignments_1.24.0    prettyunits_1.1.1           goftest_1.2-2               cluster_2.1.0               lazyeval_0.2.2              seqLogo_1.54.3              crayon_1.4.1                pkgconfig_2.0.3             slam_0.1-47                 labeling_0.4.2              tweenr_1.0.1                nlme_3.1-150                rlang_0.4.11                globals_0.14.0              lifecycle_1.0.0             miniUI_0.1.1.1              BiocFileCache_1.12.1        modelr_0.1.8                cellranger_1.1.0            rprojroot_2.0.2             polyclip_1.10-0             matrixStats_0.59.0          lmtest_0.9-38               Matrix_1.3-4                ggseqlogo_0.1               carData_3.0-4               zoo_1.8-9                   reprex_0.3.0                ggridges_0.5.3              png_0.1-7                  
##  [85] viridisLite_0.4.0           bitops_1.0-7                R.oo_1.24.0                 KernSmooth_2.23-17          Biostrings_2.56.0           blob_1.2.1                  parallelly_1.26.1           rstatix_0.6.0               ggsignif_0.6.0              CNEr_1.24.0                 scales_1.1.1                memoise_1.1.0               magrittr_2.0.1              ica_1.0-2                   zlibbioc_1.34.0             compiler_4.0.3              RColorBrewer_1.1-2          fitdistrplus_1.1-5          Rsamtools_2.4.0             cli_3.0.0                   XVector_0.28.0              listenv_0.8.0               patchwork_1.1.1             pbapply_1.4-3               MASS_7.3-53                 mgcv_1.8-33                 tidyselect_1.1.1            stringi_1.6.2               yaml_2.2.1                  askpass_1.1                 ggrepel_0.9.1               grid_4.0.3                  fastmatch_1.1-0             tools_4.0.3                 future.apply_1.7.0          rstudioapi_0.11             TFMPvalue_0.0.8             foreign_0.8-80              lsa_0.73.2                  gridExtra_2.3               farver_2.1.0                Rtsne_0.15                 
## [127] digest_0.6.27               BiocManager_1.30.10         shiny_1.6.0                 pracma_2.2.9                qlcMatrix_0.9.7             Rcpp_1.0.6                  car_3.0-10                  broom_0.7.2                 later_1.2.0                 RcppAnnoy_0.0.18            httr_1.4.2                  AnnotationDbi_1.50.3        colorspace_2.0-2            rvest_0.3.6                 XML_3.99-0.3                fs_1.5.0                    tensor_1.5                  reticulate_1.20             splines_4.0.3               uwot_0.1.10                 RcppRoll_0.3.0              spatstat.utils_2.2-0        plotly_4.9.4.1              xtable_1.8-4                jsonlite_1.7.2              poweRlaw_0.70.6             R6_2.5.0                    pillar_1.6.1                htmltools_0.5.1.1           mime_0.11                   glue_1.4.2                  fastmap_1.1.0               BiocParallel_1.22.0         codetools_0.2-17            utf8_1.2.1                  lattice_0.20-41             spatstat.sparse_2.0-0       curl_4.3.2                  leiden_0.3.8                gtools_3.9.2                zip_2.1.1                   GO.db_3.11.4               
## [169] openxlsx_4.2.4              openssl_1.4.4               survival_3.2-7              rmarkdown_2.5               docopt_0.7.1                munsell_0.5.0               GenomeInfoDbData_1.2.3      haven_2.3.1                 reshape2_1.4.4              gtable_0.3.0                spatstat.core_2.2-0